Working with geospatial data

Based on Chapter 17 from Modern Data Science with R.

You can download this .qmd file from here. Just hit the Download Raw File button.

# Initial packages required (we'll be adding more)
library(tidyverse)
library(mdsr)      # package associated with our MDSR book
library(sf)        
# sf = support for simple features, a standardized way to encode spatial vector data
library(ggspatial)

Our goal in “Maps - Part 2” is to learn about how to work with shapefiles, which are an open data structure for encoding spatial information. We will learn about projections (from three-dimensional space into two-dimensional space) and how to create informative, spatially-aware visualizations. We will just skim the surface in 264; for a much more thorough coverage, take our Spatial Statistics course!

Section 17.4: Extended example: NC Congressional Districts

In North Carolina, there are about the same number of Democratic and Republican voters in the state. In the fall of 2020, 10 of North Carolina’s 13 congressional representatives were Republican (with one seat currently vacant). How can this be? In this case, geospatial data can help us understand.

Note: the seats are currently 7 and 7 (NC earned an additional seat for 2022 after the 2020 Census), but 3 are expected to flip back to Republicans again after yet another round of questionable redistricting

# To install fec 12 the first time, uncomment the code below (you might have to install devtools as well):
# devtools::install_github("baumer-lab/fec12")
library(fec12)
print(results_house, width = Inf)
# A tibble: 2,343 × 13
   state district_id cand_id   incumbent party primary_votes primary_percent
   <chr> <chr>       <chr>     <lgl>     <chr>         <dbl>           <dbl>
 1 AL    01          H2AL01077 TRUE      R             48702          0.555 
 2 AL    01          H2AL01176 FALSE     R             21308          0.243 
 3 AL    01          H2AL01184 FALSE     R             13809          0.158 
 4 AL    01          H0AL01030 FALSE     R              3854          0.0440
 5 AL    02          H0AL02087 TRUE      R                NA         NA     
 6 AL    02          H2AL02141 FALSE     D                NA         NA     
 7 AL    03          H2AL03032 TRUE      R                NA         NA     
 8 AL    03          H2AL03099 FALSE     D                NA         NA     
 9 AL    04          H6AL04098 TRUE      R                NA         NA     
10 AL    04          H2AL04055 FALSE     D             10971          0.514 
   runoff_votes runoff_percent general_votes general_percent won   footnotes
          <dbl>          <dbl>         <dbl>           <dbl> <lgl> <chr>    
 1           NA             NA        196374           0.979 TRUE  <NA>     
 2           NA             NA            NA          NA     FALSE <NA>     
 3           NA             NA            NA          NA     FALSE <NA>     
 4           NA             NA            NA          NA     FALSE <NA>     
 5           NA             NA        180591           0.636 TRUE  <NA>     
 6           NA             NA        103092           0.363 FALSE <NA>     
 7           NA             NA        175306           0.640 TRUE  <NA>     
 8           NA             NA         98141           0.358 FALSE <NA>     
 9           NA             NA        199071           0.740 TRUE  <NA>     
10           NA             NA         69706           0.259 FALSE <NA>     
# ℹ 2,333 more rows
results_house |>
  group_by(state, district_id) |>
  summarize(N = n())
# A tibble: 445 × 3
# Groups:   state [56]
   state district_id     N
   <chr> <chr>       <int>
 1 AK    00             10
 2 AL    01              4
 3 AL    02              2
 4 AL    03              2
 5 AL    04              3
 6 AL    05              3
 7 AL    06              6
 8 AL    07              3
 9 AR    01              6
10 AR    02              4
# ℹ 435 more rows

[Pause to ponder:] Why are there 435 Representatives in the US House but 445 state/district combinations in our data? And how should we handle cases in which there’s just not 1 Democrat vs 1 Republican?

# summary of the 13 congressional NC districts and the 2012 voting
district_elections <- results_house |>
  mutate(district = parse_number(district_id)) |>
  group_by(state, district) |>
  summarize(
    N = n(), 
    total_votes = sum(general_votes, na.rm = TRUE),
    d_votes = sum(ifelse(party == "D", general_votes, 0), na.rm = TRUE),
    r_votes = sum(ifelse(party == "R", general_votes, 0), na.rm = TRUE),
    .groups = "drop"
  ) |>
  mutate(
    other_votes = total_votes - d_votes - r_votes,
    r_prop = r_votes / total_votes,  
    winner = ifelse(r_votes > d_votes, "Republican", "Democrat")
  )
nc_results <- district_elections |>
  filter(state == "NC")
nc_results |>                  
  select(-state)
# A tibble: 13 × 8
   district     N total_votes d_votes r_votes other_votes r_prop winner    
      <dbl> <int>       <dbl>   <dbl>   <dbl>       <dbl>  <dbl> <chr>     
 1        1     4      338066  254644   77288        6134  0.229 Democrat  
 2        2     8      311397  128973  174066        8358  0.559 Republican
 3        3     3      309885  114314  195571           0  0.631 Republican
 4        4     4      348485  259534   88951           0  0.255 Democrat  
 5        5     3      349197  148252  200945           0  0.575 Republican
 6        6     4      364583  142467  222116           0  0.609 Republican
 7        7     4      336736  168695  168041           0  0.499 Democrat  
 8        8     8      301824  137139  160695        3990  0.532 Republican
 9        9    13      375690  171503  194537        9650  0.518 Republican
10       10     6      334849  144023  190826           0  0.570 Republican
11       11    11      331426  141107  190319           0  0.574 Republican
12       12     3      310908  247591   63317           0  0.204 Democrat  
13       13     5      370610  160115  210495           0  0.568 Republican

[Pause to ponder:]

  • Explain how sum(ifelse(party == "D", general_votes, 0), na.rm = TRUE) works
  • Explain why we use .groups = "drop". Hint: try excluding that line and running again.
  • Do you see any potential problems with ifelse(r_votes > d_votes, "Republican", "Democrat")?
  • What observations can you make about the final nc_results table?
# distribution of total number of votes is narrow by design
nc_results |>
  skim(total_votes) |>
  select(-na)

Variable type: numeric

var n mean sd p0 p25 p50 p75 p100
total_votes 13 337204.3 24175.2 301824 311397 336736 349197 375690
# compare total Dem and Rep votes across NC in 2012
nc_results |>
  summarize(
    N = n(), 
    state_votes = sum(total_votes), 
    state_d = sum(d_votes), 
    state_r = sum(r_votes)
  ) |>
  mutate(
    d_prop = state_d / state_votes, 
    r_prop = state_r / state_votes
  )
# A tibble: 1 × 6
      N state_votes state_d state_r d_prop r_prop
  <int>       <dbl>   <dbl>   <dbl>  <dbl>  <dbl>
1    13     4383656 2218357 2137167  0.506  0.488
# Proportion of Rep votes by district
nc_results |>
  select(district, r_prop, winner) |>
  arrange(desc(r_prop))
# A tibble: 13 × 3
   district r_prop winner    
      <dbl>  <dbl> <chr>     
 1        3  0.631 Republican
 2        6  0.609 Republican
 3        5  0.575 Republican
 4       11  0.574 Republican
 5       10  0.570 Republican
 6       13  0.568 Republican
 7        2  0.559 Republican
 8        8  0.532 Republican
 9        9  0.518 Republican
10        7  0.499 Democrat  
11        4  0.255 Democrat  
12        1  0.229 Democrat  
13       12  0.204 Democrat  

Now let’s layer the results above on a map of North Carolina to create an effective visualization of the situation. How does the shape of districts where Republicans won compare with the shape where Democrats won?

# Download congressional district shapefiles for the 113th Congress from a UCLA website (don't sweat the details too much)
src <- "http://cdmaps.polisci.ucla.edu/shp/districts113.zip"
lcl_zip <- fs::path(tempdir(), "districts113.zip")
download.file(src, destfile = lcl_zip)
lcl_districts <- fs::path(tempdir(), "districts113")
unzip(lcl_zip, exdir = lcl_districts)
dsn_districts <- fs::path(lcl_districts, "districtShapes")

# You can also downloaded zip file and uploaded it into R, but this uses a ton of space!
# dsn_districts <- fs::path("Data/districtShapes")

# read shapefiles into R as an sf object
st_layers(dsn_districts)
Driver: ESRI Shapefile 
Available layers:
    layer_name geometry_type features fields crs_name
1 districts113       Polygon      436     15    NAD83
# be able to read as a data frame as well
districts <- st_read(dsn_districts, layer = "districts113") |>
  mutate(DISTRICT = parse_number(as.character(DISTRICT)))
Reading layer `districts113' from data source 
  `C:\Users\roback\AppData\Local\Temp\RtmpE7UScU\districts113\districtShapes' 
  using driver `ESRI Shapefile'
Simple feature collection with 436 features and 15 fields (with 1 geometry empty)
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -179.1473 ymin: 18.91383 xmax: 179.7785 ymax: 71.35256
Geodetic CRS:  NAD83
head(districts, width = Inf)
Simple feature collection with 6 features and 15 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -91.82307 ymin: 29.41135 xmax: -66.94983 ymax: 47.45969
Geodetic CRS:  NAD83
  STATENAME           ID DISTRICT STARTCONG ENDCONG DISTRICTSI COUNTY PAGE  LAW
1 Louisiana 022113114006        6       113     114       <NA>   <NA> <NA> <NA>
2     Maine 023113114001        1       113     114       <NA>   <NA> <NA> <NA>
3     Maine 023113114002        2       113     114       <NA>   <NA> <NA> <NA>
4  Maryland 024113114001        1       113     114       <NA>   <NA> <NA> <NA>
5  Maryland 024113114002        2       113     114       <NA>   <NA> <NA> <NA>
6  Maryland 024113114003        3       113     114       <NA>   <NA> <NA> <NA>
  NOTE BESTDEC                  FINALNOTE RNOTE                 LASTCHANGE
1 <NA>    <NA> {"From US Census website"}  <NA> 2016-05-29 16:44:10.857626
2 <NA>    <NA> {"From US Census website"}  <NA> 2016-05-29 16:44:10.857626
3 <NA>    <NA> {"From US Census website"}  <NA> 2016-05-29 16:44:10.857626
4 <NA>    <NA> {"From US Census website"}  <NA> 2016-05-29 16:44:10.857626
5 <NA>    <NA> {"From US Census website"}  <NA> 2016-05-29 16:44:10.857626
6 <NA>    <NA> {"From US Census website"}  <NA> 2016-05-29 16:44:10.857626
  FROMCOUNTY                       geometry
1          F MULTIPOLYGON (((-91.82288 3...
2          F MULTIPOLYGON (((-70.98905 4...
3          F MULTIPOLYGON (((-71.08216 4...
4          F MULTIPOLYGON (((-77.31156 3...
5          F MULTIPOLYGON (((-76.8763 39...
6          F MULTIPOLYGON (((-77.15622 3...
class(districts)
[1] "sf"         "data.frame"
# create basic plot with NC congressional districts
nc_shp <- districts |>
  filter(STATENAME == "North Carolina")
nc_shp |>
  st_geometry() |>
  plot(col = gray.colors(nrow(nc_shp)))

# Append election results to geospatial data
nc_merged <- nc_shp |>
  st_transform(4326) |>
  inner_join(nc_results, by = c("DISTRICT" = "district"))
head(nc_merged, width = Inf)
Simple feature collection with 6 features and 23 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -81.91811 ymin: 34.19118 xmax: -75.45998 ymax: 36.58812
Geodetic CRS:  WGS 84
       STATENAME           ID DISTRICT STARTCONG ENDCONG DISTRICTSI COUNTY PAGE
1 North Carolina 037113114002        2       113     114       <NA>   <NA> <NA>
2 North Carolina 037113114003        3       113     114       <NA>   <NA> <NA>
3 North Carolina 037113114004        4       113     114       <NA>   <NA> <NA>
4 North Carolina 037113114001        1       113     114       <NA>   <NA> <NA>
5 North Carolina 037113114005        5       113     114       <NA>   <NA> <NA>
6 North Carolina 037113114006        6       113     114       <NA>   <NA> <NA>
   LAW NOTE BESTDEC                  FINALNOTE RNOTE                 LASTCHANGE
1 <NA> <NA>    <NA> {"From US Census website"}  <NA> 2016-05-29 16:44:10.857626
2 <NA> <NA>    <NA> {"From US Census website"}  <NA> 2016-05-29 16:44:10.857626
3 <NA> <NA>    <NA> {"From US Census website"}  <NA> 2016-05-29 16:44:10.857626
4 <NA> <NA>    <NA> {"From US Census website"}  <NA> 2016-05-29 16:44:10.857626
5 <NA> <NA>    <NA> {"From US Census website"}  <NA> 2016-05-29 16:44:10.857626
6 <NA> <NA>    <NA> {"From US Census website"}  <NA> 2016-05-29 16:44:10.857626
  FROMCOUNTY state N total_votes d_votes r_votes other_votes    r_prop
1          F    NC 8      311397  128973  174066        8358 0.5589842
2          F    NC 3      309885  114314  195571           0 0.6311083
3          F    NC 4      348485  259534   88951           0 0.2552506
4          F    NC 4      338066  254644   77288        6134 0.2286181
5          F    NC 3      349197  148252  200945           0 0.5754488
6          F    NC 4      364583  142467  222116           0 0.6092330
      winner                       geometry
1 Republican MULTIPOLYGON (((-80.05325 3...
2 Republican MULTIPOLYGON (((-78.27217 3...
3   Democrat MULTIPOLYGON (((-79.47249 3...
4   Democrat MULTIPOLYGON (((-78.95837 3...
5 Republican MULTIPOLYGON (((-81.91805 3...
6 Republican MULTIPOLYGON (((-80.97462 3...
# Color based on winning party
#   Note that geom_sf is part of ggplot2 package, while st_geometry is
#   part of sf package
nc <- ggplot(data = nc_merged, aes(fill = winner)) +
  annotation_map_tile(zoom = 6, type = "osm", progress = "none") + 
  geom_sf(alpha = 0.5) +
  scale_fill_manual("Winner", values = c("blue", "red")) + 
  geom_sf_label(aes(label = DISTRICT), fill = "white") + 
  theme_void()
nc

# Color based on proportion Rep.  Be sure to let limits so centered at 0.5.
# This is a choropleth map, where meaningful shading relates to some attribute
nc +
  aes(fill = r_prop) + 
  scale_fill_distiller(
    "Proportion\nRepublican", 
    palette = "RdBu", 
    limits = c(0.2, 0.8)
  )

# A leaflet map can allow us to zoom in and see where major cities fit, etc.
library(leaflet)
pal <- colorNumeric(palette = "RdBu", domain = c(0, 1))

leaflet_nc <- leaflet(nc_merged) |>
  addTiles() |>
  addPolygons(
    weight = 1, fillOpacity = 0.7, 
    color = ~pal(1 - r_prop),   # so red association with Reps
    popup = ~paste("District", DISTRICT, "</br>", round(r_prop, 4))
  ) |>                          # popups show prop Republican
  setView(lng = -80, lat = 35, zoom = 7)
leaflet_nc

[Pause to ponder:] What have you learned by layering the voting data on the voting districts of North Carolina?